# -*- coding: utf-8 -*-

import scrapy
import re
import datetime
from test1.items import aqbzh

class aqbzhspider(scrapy.Spider):
    """Scrape enterprise safety-standardization records from the
    chinasafety.gov.cn query endpoint and yield one `aqbzh` item per row.

    The site exposes a single POST endpoint; the results page is selected
    via the ``pageRequest.toPage`` form field.
    """

    name = 'aqbzh'

    # Single query endpoint; the same URL serves every results page.
    SEARCH_URL = 'http://aqbzh.chinasafety.gov.cn:8080/wss/loginAction!dbqySearch.action'
    # Pagination parameters.  NOTE(review): the original loop was
    # ``range(1, 617)`` i.e. pages 1..616; if the site actually has 617
    # pages the last one is skipped — confirm against the live site.
    LAST_PAGE = 617
    PAGE_SIZE = 50

    def _page_request(self, page, callback, dont_filter=False):
        """Build the POST request for one results page (helper)."""
        return scrapy.FormRequest(
            self.SEARCH_URL,
            formdata={
                'pageRequest.toPage': str(page),
                'pageRequest.pageSize': str(self.PAGE_SIZE),
            },
            callback=callback,
            dont_filter=dont_filter,
        )

    def start_requests(self):
        """Send the initial POST for page 1; its response drives pagination."""
        yield self._page_request(1, self.parse)

    def parse(self, response):
        """Fan out one POST per results page.

        ``dont_filter=True`` is required for page 1: start_requests()
        already issued an identical POST, and Scrapy's duplicate-request
        filter (which fingerprints method + URL + body) would otherwise
        drop the re-request, silently losing every row on page 1.
        """
        for page in range(1, self.LAST_PAGE):
            yield self._page_request(page, self.parse_info,
                                     dont_filter=(page == 1))

    def parse_info(self, response):
        """Extract the wanted fields from each data row into an item."""
        tr_list = response.xpath(
            "//div[@class='work_con_right']/form[@id='form1']/table/tr")
        # Iterate the table row by row, skipping header rows (those that
        # contain a <th class='index'> cell).
        for tr in tr_list:
            if tr.xpath("th[@class='index']").extract():
                continue
            cominfo = aqbzh()
            cominfo['the_name_of_the_enterprise'] = tr.xpath("td[2]/text()").extract_first()
            cominfo['address'] = tr.xpath("td[3]/text()").extract_first()
            cominfo['the_legal_representative'] = tr.xpath("td[4]/text()").extract_first()
            cominfo['standard_grade'] = tr.xpath("td[@class='nborder']/div//text()").extract_first()
            cominfo['url'] = response.url
            # Naive local timestamp, stringified as 'YYYY-MM-DD HH:MM:SS.ffffff'
            # (kept as-is for downstream compatibility).
            cominfo['insert_time'] = str(datetime.datetime.now())
            yield cominfo
